library(plyr)
library(dplyr)
library(tidyverse)
library(lmtest)
library(data.table)
library(nlme)
library(EnvStats)

# Load the graduation data set. The path is relative, so Graduation.csv must
# be in the current working directory.
data_org <- read.csv(file = "Graduation.csv")
summary(data_org)
view(data_org)

# 1) Crude death rates: observed deaths divided by the exposed-to-risk (ETR)
# at each age. Work on a copy so the raw data in data_org stays untouched.
data2 <- data_org
data2[["CRUDE"]] <- data_org$DEATHS / data_org$ETR
view(data2)

# 2) Graduated rates under Gompertz' law, written here as
#    rate(x) = B * exp(C * x).
# Taking logs gives log(rate) = log(B) + C * x, so a least-squares fit of
# log(crude rate) on age returns log(B) as the intercept and C as the slope.
model <- lm(log(CRUDE) ~ AGE, data = data2)
summary(model)
coeff <- as.numeric(coef(model))
B <- exp(coeff[1])  # intercept is on the log scale, so back-transform it
C <- coeff[2]       # slope is used as-is in the exponent
B
C
data2[["GRADUATED"]] <- B * exp(C * data2$AGE)
view(data2)

# 3) Checking for smoothness
# Third differences of the graduated rates. Each level is differenced from
# the level directly above it; the earlier version differenced the undefined
# objects `first_diff` / `second_diff`, which either fails or silently picks
# up stale objects left in the workspace.
first_diff_G <- diff(data2$GRADUATED)
second_diff_G <- diff(first_diff_G)
third_diff_G <- diff(second_diff_G)
third_diff_G

# Same calculation on the crude rates, for comparison.
first_diff_C <- diff(data2$CRUDE)
second_diff_C <- diff(first_diff_C)
third_diff_C <- diff(second_diff_C)
third_diff_C

# Criterion: the graduation is smooth when the third differences of the
# graduated rates are small relative to the rates themselves and progress
# regularly from age to age.
# Recorded conclusion: the values are very low compared to the original values
# and they progress regularly, so the data is smooth.
# NOTE(review): re-confirm this conclusion against the corrected output above.

# 4) Chi-square test of the graduation
# Expected deaths at each age under the graduated rates.
data2$EXPECTED <- data2$GRADUATED * data2$ETR
# Standardized deviation at each age: (observed - expected) / sqrt(expected).
data2$ZX <- (data2$DEATHS - data2$EXPECTED) / sqrt(data2$EXPECTED)
chisq_data <- data.frame(data2$DEATHS, data2$EXPECTED)
view(chisq_data)

# The test statistic is the sum of the squared standardized deviations.
# Passing the two-column data frame to chisq.test() is not equivalent: it is
# converted to a matrix and analysed as a contingency table (a test of
# independence between rows and columns), which is not a comparison of
# observed with expected deaths.
chisq_stat <- sum(data2$ZX^2)
# Two degrees of freedom are deducted for the two fitted Gompertz parameters.
# NOTE(review): confirm this deduction matches the convention required here.
chisq_df <- length(data2$ZX) - 2
chisq_pvalue <- pchisq(chisq_stat, df = chisq_df, lower.tail = FALSE)
# A p-value below 0.05 rejects, at the 5% level, the hypothesis that the
# graduated rates are the true underlying rates.
c(statistic = chisq_stat, df = chisq_df, p.value = chisq_pvalue)
view(data2)

# 5a) Standardized deviations test
# Frequency table of the standardized deviations ZX in bins of width 4
# between -20 and 20.
zx_breaks <- seq.int(-20, 20, by = 4)
table(cut(data2$ZX, breaks = zx_breaks))
# Observations recorded from the output:
#   i.   The distribution is far wider than a standard normal, with several
#        values beyond 10 in absolute size.
#   ii.  The absolute deviations are very large relative to the expected
#        values.
#   iii. The lower bound is approximately -2 and the upper bound approximately
#        6, giving an IQR of 8. There are some outliers on the lower side but
#        none on the upper side.
#   iv.  The distribution is positively skewed and appears centred near 2.
#   v.   The graduated rates do not represent the underlying mortality rates
#        accurately.

# 5b) Sign test
# Under the null hypothesis each deviation is equally likely to be positive or
# negative, so the number of positive signs follows Binomial(n, 0.5).
signtest <- sign(data2$ZX)
table(signtest)

# Take the counts from the data instead of hard-coding them, so the test stays
# correct if the data set or the fitted model changes. Deviations that are
# exactly zero carry no sign and are left out of the trial count.
n_positive <- sum(signtest > 0, na.rm = TRUE)
n_signed <- sum(signtest != 0, na.rm = TRUE)

# dbinom() would only give the probability of observing exactly n_positive
# positive signs; the test needs the two-sided tail probability, which
# binom.test() reports as its p-value.
binom.test(n_positive, n_signed, p = 0.5, alternative = "two.sided")
# Decision rule at the 5% significance level (two-tailed): if the p-value is
# 0.05 or more, do not reject the null hypothesis that the graduated rates are
# a true representation of the underlying mortality rates.

# 5c) Cumulative deviations test
# Compares total observed deaths with total expected deaths over all ages.
obs <- sum(data2$DEATHS)
# `exp` (total expected deaths) keeps its existing name so any later code that
# reads it still works. It shadows the name of base::exp, but calls such as
# exp(x) still resolve to the function.
exp <- sum(data2$EXPECTED)
# Standardize by the standard deviation of the total, sqrt(expected), not by
# the expected total itself; only then is z approximately standard normal
# under the null hypothesis.
z <- (obs - exp) / sqrt(exp)
# Two-sided p-value: a large excess or a large shortfall of deaths both count
# as evidence against the graduation.
2 * pnorm(-abs(z))
# Decision rule at the 5% significance level: if the p-value is 0.05 or more,
# do not reject the null hypothesis that the graduated rates are a true
# representation of the underlying mortality rates.

# 5d) Serial correlations test
# Runs serialCorrelationTest() on the standardized deviations ZX to check
# whether deviations at neighbouring ages are related to one another.
# NOTE(review): the function is presumably the one from the EnvStats package
# loaded at the top of the file; its default options are used -- confirm they
# are the intended ones.
serialCorrelationTest(data2$ZX)
# Recorded conclusion: at the 5% significance level, we accept the null
# hypothesis that the graduated rates are a true representation of the
# underlying mortality rates.